In [1]:
from collections import OrderedDict # For recording the model specification
import pandas as pd # For file input/output
import numpy as np # For vectorized math operations
import pylogit as pl # For MNL model estimation
# (pylogit also provides the wide-to-long format conversion used below)
In [2]:
# Read in the raw swissmetro data.
# The .dat file is a plain tab-delimited text file, so read_csv with
# a tab separator parses it directly.
swissmetro_wide = pd.read_csv("../data/swissmetro.dat", sep='\t')
Note that the 01Logit.py file provided is an example from Python Biogeme (see: http://biogeme.epfl.ch/examples_swissmetro.html). See http://www.strc.ch/conferences/2001/bierlaire1.pdf for a detailed explanation of the variables. The 01Logit.py file excludes observations meeting the following criteria:
exclude = (( PURPOSE != 1 ) * ( PURPOSE != 3 ) + ( CHOICE == 0 )) > 0

As a result, their dataset has 6,768 observations. Below, I make the same exclusions.
In [3]:
# Keep only rows whose choice is known (CHOICE != 0) **AND** whose
# PURPOSE value is either 1 or 3, mirroring the exclusions made in
# the python biogeme example.
keep_rows = (swissmetro_wide.PURPOSE.isin([1, 3]) &
             (swissmetro_wide.CHOICE != 0))

# Use ".copy()" so later operations act on a real dataframe rather
# than on a view of swissmetro_wide.
clean_sm_wide = swissmetro_wide.loc[keep_rows].copy()

# Report how many observations remain after the exclusions.
final_num_obs = clean_sm_wide.shape[0]
num_obs_statement = "The cleaned number of observations is {:,.0f}."
print(num_obs_statement.format(final_num_obs))
In [4]:
# Build an observation id column that deliberately ignores the
# panel/repeated-observations structure of the dataset: each row gets
# its own "custom_id", numbered consecutively starting from 1.
clean_sm_wide["custom_id"] = np.arange(1, clean_sm_wide.shape[0] + 1, dtype=int)
In [5]:
# Display the column names of the cleaned wide-format swissmetro data
# (used below to pick out the individual-specific variables).
clean_sm_wide.columns
Out[5]:
In [6]:
# Individual-specific variables: the first 15 columns of the wide data
# (they do not vary across alternatives).
ind_variables = clean_sm_wide.columns.tolist()[:15]

# Variables that vary across individuals **AND** across some or all
# alternatives. Keys become the long-format column names; each value
# maps an alternative id to the wide-format column holding that
# alternative's value.
alt_varying_variables = {u'travel_time': {1: 'TRAIN_TT',
                                          2: 'SM_TT',
                                          3: 'CAR_TT'},
                         u'travel_cost': {1: 'TRAIN_CO',
                                          2: 'SM_CO',
                                          3: 'CAR_CO'},
                         u'headway': {1: 'TRAIN_HE',
                                      2: 'SM_HE'},
                         u'seat_configuration': {2: "SM_SEATS"}}

# Availability indicator column for each alternative id.
availability_variables = {1: 'TRAIN_AV',
                          2: 'SM_AV',
                          3: 'CAR_AV'}

# Names for the new alternative id column, the custom observation id
# column, and the choice column.
new_alt_id = "mode_id"
obs_id_column = "custom_id"
choice_column = "CHOICE"
In [7]:
# Convert the cleaned wide-format data to long format. The positional
# arguments are, in order: the wide dataframe, the individual-specific
# columns, the alternative-varying column mapping, the availability
# columns, the observation id column, and the choice column.
# NOTE(review): the positional order here is assumed to match
# pylogit.convert_wide_to_long's signature — confirm against the
# pylogit documentation if the library version changes.
long_swiss_metro = pl.convert_wide_to_long(clean_sm_wide,
                                           ind_variables,
                                           alt_varying_variables,
                                           availability_variables,
                                           obs_id_column,
                                           choice_column,
                                           new_alt_id_name=new_alt_id)
# Look at the first 9 rows of the long-format dataframe, transposed
# so the many columns are easier to scan.
long_swiss_metro.head(9).T
Out[7]:
In [8]:
# Divide travel time by 100, matching the variable scaling used in
# the python biogeme example.
long_swiss_metro["travel_time_hundredth"] = (long_swiss_metro["travel_time"] /
                                             100.0)

# Identify rows corresponding to the train (id 1) or swiss metro
# (id 2) alternatives for individuals holding a GA pass; per the
# original example, these individuals face no marginal cost for the
# trip.
has_ga_pass = long_swiss_metro["GA"] == 1
is_rail_alternative = long_swiss_metro["mode_id"].isin([1, 2])
train_pass_train_alt = (has_ga_pass & is_rail_alternative).astype(int)

# Zero out travel cost for GA-pass holders on the rail alternatives
# (the (train_pass_train_alt == 0) factor), then divide by 100 to
# match the travel-time scaling.
long_swiss_metro["travel_cost_hundredth"] = (
    long_swiss_metro["travel_cost"] * (train_pass_train_alt == 0) / 100.0
)
In [9]:
# Create the model's specification dictionary and variable names
# dictionary.
# NOTE: - Keys should be variables within the long-format dataframe.
#         The sole exception to this is the "intercept" key.
#       - For the specification dictionary, the values should be lists
#         or lists of lists. Within a list, or within the inner-most
#         list, should be the ids of the alternatives whose utility
#         specifications the explanatory variable enters.
example_specification = OrderedDict()
example_names = OrderedDict()

# 1 is the id for the Train and 3 is the id for the Car. The next two
# lines place alternative-specific constants in the utility equations
# for the Train and the Car; no constant is specified for the Swiss
# Metro (id 2). The order (Car first, then Train) is chosen so the
# returned summary dataframe matches the one shown in the HTML file
# of the python biogeme example.
example_specification["intercept"] = [3, 1]
example_names["intercept"] = ['ASC Car', 'ASC Train']

# A single generic coefficient is estimated for cost and for time
# across all three alternatives, hence the inner list [1, 2, 3].
# The names below match the coefficient names used in the python
# biogeme example.
example_specification["travel_cost_hundredth"] = [[1, 2, 3]]
example_names["travel_cost_hundredth"] = ['B_COST']
example_specification["travel_time_hundredth"] = [[1, 2, 3]]
example_names["travel_time_hundredth"] = ['B_TIME']
In [10]:
# Instantiate the MNL model class with the long-format data, the id
# and choice columns, and the specification built above.
example_mnl = pl.create_choice_model(data=long_swiss_metro,
                                     alt_id_col=new_alt_id,
                                     obs_id_col=obs_id_column,
                                     choice_col=choice_column,
                                     specification=example_specification,
                                     model_type="MNL",
                                     names=example_names)

# Estimate the model starting from all zeros — one initial value for
# each of the 4 coefficients being estimated.
initial_values = np.zeros(4)
example_mnl.fit_mle(initial_values)
In [11]:
# Display the estimated coefficients and goodness-of-fit statistics
# as a statsmodels-style summary table.
example_mnl.get_statsmodels_summary()
Out[11]:
In [12]:
# The model's own summary attribute additionally shows robust
# p-values, in case one wants to see them.
example_mnl.summary
Out[12]:
My estimation results match those of Python Biogeme.
The Python Biogeme log-likelihood is -5,331.252, and their estimated parameters are:
ASC Car: -0.155; ASC Train: -0.701; B_COST: -1.08; B_TIME: -1.28.
As shown above, my log-likelihood is also -5,331.252, and my estimated parameters are:
ASC Car: -0.1546; ASC Train: -0.7012; B_COST: -1.0838; B_TIME: -1.2779.